import pandas as pd
import argparse
# Command-line interface.
# NOTE(review): --sample / -s is accepted but currently has no effect. The only
# code that reads args.sample sits inside the disabled string block below, and
# `sample` is then hard-coded to False.
parser = argparse.ArgumentParser()
parser.add_argument("--sample",'-s',help='Sample instead of full?')
args = parser.parse_args()

# The triple-quoted text below is a bare string expression, not live code:
# Python evaluates it and discards the result, so nothing inside it runs.
'''
#############
##### For now, sample
##############
if args.sample:
    print(args.sample)
    sample = (args.sample=='True')
else:
    sample = True
print('Sampling: '+str(sample))
'''
# Hard-coded regardless of the command line; `sample` is not read again in the
# visible part of this script.
sample = False



################
#### Load ind_return_data (taxpayer_id type = 0)
################

# This frame is the left-hand base of the merges further down, so it is
# reduced to one row per taxpayer_id before anything is joined onto it.
ind_return_data = pd.read_csv('/REDACTED/data/raw/pulledData/ind_return_data_2014.csv')
if ind_return_data['taxpayer_id'].value_counts().max() != 1:
    # Some taxpayer_id occurs more than once: keep only its first row.
    ind_return_data = ind_return_data.drop_duplicates(subset='taxpayer_id', keep='first')
    print('ind_return_data taxpayer_ids not unique; dropped dupe taxpayer_ids.')
else:
    print('ind_return_data taxpayer_ids are unique')

### Author's note (presumably from a past run): ind_return_data taxpayer_ids not unique

#######################################################
#### Load new BIFSG file
#######################################################

op_new = pd.read_stata('/REDACTED/BIFSG/BIFSG_TY2014.dta')
# The file's own taxpayer_id is kept as taxpayer_id_new, while taxpayer_id_old
# takes over the plain taxpayer_id name used as the merge key further down.
op_new = op_new.rename(columns={'taxpayer_id': 'taxpayer_id_new',
                                'taxpayer_id_old': 'taxpayer_id'})
# Expand the "pprob_" column prefix to "predicted_prob_".
op_new.columns = op_new.columns.str.replace("pprob_", "predicted_prob_")

# One row per (renamed) taxpayer_id, same rule as for the other inputs.
if op_new['taxpayer_id'].value_counts().max() != 1:
    op_new = op_new.drop_duplicates(subset='taxpayer_id', keep='first')
    print('op_new taxpayer_ids not unique; dropped dupe taxpayer_ids.')
else:
    print('op_new taxpayer_ids are unique')

### Author's note (presumably from a past run): op_new taxpayer_ids are unique


###############
## Load audit_data
################


audit_data = pd.read_csv('/REDACTED/data/clean/audit_data_2014_auditlevel.csv')
# Sanity check: no taxpayer_id may be associated with more than one distinct
# tax_period.
assert audit_data.groupby('taxpayer_id')['tax_period'].nunique().max() == 1
# Drop columns the original note calls redundant (tax period is taken from
# soi_data, the risk score from ind_return_data). errors='ignore' keeps this a
# no-op for a column that is absent, exactly like the name filter it replaces.
audit_data = audit_data.drop(columns=['tax_period', 'irs_risk_score'], errors='ignore')



################
#### Load soi_data
#################

soi_data = pd.read_csv('/REDACTED/data/raw/pulledData/soi_data_2014.csv')
# Sanity check: every taxpayer_id appears exactly once in the raw file.
assert soi_data['taxpayer_id'].value_counts().max() == 1
# Keep only rows whose taxpayer_id equals taxpayer_id_pe
# (presumably the primary filer's id -- TODO confirm).
soi_data = soi_data.loc[soi_data['taxpayer_id'] == soi_data['taxpayer_id_pe']]
# Drop filing_status; the original note says it already comes from
# ind_return_data. Absent column is tolerated, as with the old name filter.
soi_data = soi_data.drop(columns=['filing_status'], errors='ignore')


##############
#### Load dep_database
##############
# Only the join key and irs_dep_risk_score are read from the file.
dep_database = pd.read_csv(
    '/REDACTED/data/final/dep_database_individual_wrace_audits.csv',
    usecols=['taxpayer_id', 'irs_dep_risk_score'],
)
# Constant marker column: after the left merge further down it is 1 for rows
# that matched dep_database and missing otherwise.
dep_database['indep_database'] = 1
if dep_database['taxpayer_id'].value_counts().max() != 1:
    dep_database = dep_database.drop_duplicates(subset='taxpayer_id', keep='first')
    print('dep_database taxpayer_ids not unique; dropped dupe taxpayer_ids.')
else:
    print('dep_database taxpayer_ids are unique')

### Author's note (presumably from a past run): dep_database taxpayer_ids are unique

###################
#### Load collections_data
##################
# Only loaded here; selected columns are joined onto the merged frame on
# (taxpayer_id, tax_period) in the merge section further down.
collections_data=pd.read_csv('/REDACTED/data/clean/collections_data_2014_clean.csv')



################
### MERGE IN DATASETS
################
# Every step below is a left join that starts from ind_return_data; each merge
# also records an indicator column showing where a row was found.

### ind_return_data <- new BIFSG
merged = pd.merge(ind_return_data, op_new, on='taxpayer_id', how='left',
                  indicator='ind_return_data_to_tp')

### merged <- audit_data
# indicator=True writes the default `_merge` column; 'both' means the
# taxpayer_id was also present in audit_data.
merged = pd.merge(merged, audit_data, how='left', on='taxpayer_id', indicator=True)
merged['audited'] = (merged._merge == 'both').astype(int)

### merged <- soi_data
merged = pd.merge(merged, soi_data, on='taxpayer_id', how='left', indicator='tp_to_db')

# Older variant of the flag, kept alongside the new one: isin() compares the
# whole cell value against 80 / 91.
merged['aud_no_research_audits_old'] = ((merged.audited) & ~(merged.audit_source_code.isin([80,91]))).astype(int)

# Audited rows whose audit_source_code does not include 80 or 91 (the
# "research" codes, going by the column name).
# The cell is handled as text: brackets are turned into separators and the
# result is split on whitespace, so a code is recognised wherever it sits in
# the list. The previous substring patterns ('[80]', '[80 ', ' 80]', ...) only
# caught the first or last position and missed e.g. '[10 80 20]' or padded
# forms such as '[ 80  91]'. Every string those patterns matched still matches.
_RESEARCH_SOURCE_CODES = {'80', '91'}
merged['aud_no_research_audits'] = [
    1 if was_audited == 1
    and _RESEARCH_SOURCE_CODES.isdisjoint(
        source_codes.replace('[', ' ').replace(']', ' ').split())
    else 0
    for source_codes, was_audited in zip(merged.audit_source_code.astype(str),
                                         merged.audited)
]
merged['filing_jointly'] = merged.filing_status == 2
merged['isM'] = merged.gender_ind == 'M'

### merged <- dep_database
merged = pd.merge(merged, dep_database, on='taxpayer_id', how='left',
                  indicator='tp_to_dep_database')

# Turn each project cell into a list of 4-character, zero-padded code strings:
# stringify, strip the brackets, split on whitespace, left-pad with zeros.
merged['project'] = [
    [code.zfill(4) for code in raw.replace('[', '').replace(']', '').split()]
    for raw in merged.project.astype(str)
]

# 1 when the row matched dep_database, counts as a non-research audit, and
# carries none of the excluded project codes; otherwise 0.
_EXCLUDED_PROJECT_CODES = {'0173', '0584', '0611', '0263',
                           '0390', '0603', '1082', '1162'}
merged['dep_database_aud'] = [
    1 if in_dep == 1
    and non_research == 1
    and _EXCLUDED_PROJECT_CODES.isdisjoint(project_codes)
    else 0
    for in_dep, non_research, project_codes in zip(merged.indep_database,
                                                   merged.aud_no_research_audits,
                                                   merged.project)
]

### merged <- collections_data
merged = pd.merge(
    merged,
    collections_data[['taxpayer_id', 'tax_period', 'pre_refund',
                      'pre_refund_non_eitc', 'pre_refund_eitc']],
    on=['taxpayer_id', 'tax_period'],
    how='left',
    indicator='tp_to_collections_data',
)
# Rows without a collections match get 0 instead of a missing value.
merged['pre_refund'] = merged['pre_refund'].fillna(0)
merged['pre_refund_non_eitc'] = merged['pre_refund_non_eitc'].fillna(0)
merged['pre_refund_eitc'] = merged['pre_refund_eitc'].fillna(0)
# Non-research audit with pre_refund equal to 0.
merged['post_ref'] = [1 if non_research == 1 and pre_refund == 0 else 0
                      for non_research, pre_refund in zip(merged.aud_no_research_audits,
                                                          merged.pre_refund)]
merged['employee_code'] = merged.employee_code.astype(str)
# NOTE(review): this is a plain substring test, so any employee_code text that
# contains the digit 5 (e.g. '15' or '[50]') is flagged. The bracketed patterns
# checked previously ('[5]', '[5 ', ' 5]') all contain '5', so they were
# redundant and dropping them does not change the result. Confirm whether an
# exact-code match was intended.
merged['corr_aud'] = [1 if '5' in code_text else 0
                      for code_text in merged.employee_code]
# Non-research audit that is not flagged by corr_aud.
merged['non_corr_aud'] = [1 if non_research == 1 and corr == 0 else 0
                          for non_research, corr in zip(merged.aud_no_research_audits,
                                                        merged.corr_aud)]

merged['taxpayer_id_typ'] = 0

############
### merge updated eic def
############

# Keep only rows that have a predicted_prob_black value.
merged = merged[merged.predicted_prob_black.notna()]
extract = pd.read_stata('/REDACTED/HertzGraff/cam_eic_extract.dta')
# Give the extract's columns a _new suffix: its taxpayer_id becomes the
# taxpayer_id_new join key and the other renamed columns cannot collide with
# same-named columns already in merged.
extract = extract.rename(columns={
    "taxpayer_id": "taxpayer_id_new",
    "taxpayer_id_typ": "taxpayer_id_typ_new",
    "cycle_posted": "cycle_posted_new",
    "eic": "eic_new",
    "eitc_amt_computer": "eitc_amt_computer_new",
})
merged = merged.merge(extract, on='taxpayer_id_new', how='left')
# 1 when eic_new is positive, else 0; a missing eic_new compares False, so
# rows without an extract match get 0. Pure-pandas equivalent of
# np.where(cond, 1, 0) -- numpy is not imported by this script, so the np call
# failed with NameError.
merged['isEIC'] = (merged['eic_new'] > 0).astype(int)

##############
#### Write out
##############

# Drop leftover index-artifact columns before saving. pandas labels header-less
# CSV columns 'Unnamed: N' (capital U), so the name test has to be
# case-insensitive; the previous case-sensitive 'unnamed' test never matched
# those labels.
merged = merged[[col for col in merged.columns.tolist() if 'unnamed' not in col.lower()]]

merged.to_csv('/REDACTED/data/final/individualBISG2014_full_final.csv', index=False)
